Below, the packages required for data analysis and visualization are loaded.
library(tidyverse)
library(magrittr)
library(DBI)
library(dbplyr)
library(RMariaDB)
library(data.table)
library(stopwords)
library(tidytext)
library(RColorBrewer)
library(DT)
library(wordcloud)
W. Edwards Deming said, “In God we trust, all others must bring data.” Below, we will use data to explore the question, “Which are the most valued data science skills?”
# Connect to the remote MariaDB instance that stores the scraped job listings.
# The database password is kept out of source control in a local sql_db.txt.
db_password <- as.character(read.table("sql_db.txt", header = FALSE))
con <- DBI::dbConnect(
    RMariaDB::MariaDB(),
    dbname = "dat_sci_jobs",
    username = "root",
    password = db_password,
    host = "35.227.102.234")
tables <- dbListTables(con)
jobs_df <- dbReadTable(con, "_Jobs")
# Preview the jobs table (minus column 3, the URL) before closing the
# connection.
datatable(jobs_df[, -3], options = list(pageLength = 25))
dbDisconnect(con)
# Read the ledger of files already merged into the database, then append any
# new "_feeds_" CSVs (scraped RSS job feeds) that have not been processed yet.
completed_files <- readLines("completed_files.txt")
url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/"
new_jobs_df <- as.data.frame(matrix(nrow = 0, ncol = 11))
files <- list.files(pattern = "_feeds_.*csv$")
# seq_along() (not 1:length()) so an empty match runs zero iterations instead
# of indexing files[1] on a zero-length vector.
for (i in seq_along(files)) {
    if (!(files[i] %in% completed_files)) {
        file <- paste0(url_base, files[i])
        csv <- read.csv(file = file, header = TRUE)
        new_jobs_df <- rbind(new_jobs_df, csv)
        completed_files <- append(completed_files, files[i])
    }
}
# Same pattern for the "_linkedin_" CSV exports, then persist the updated
# ledger so the next run skips everything processed so far.
new_jobs_df2 <- as.data.frame(matrix(nrow = 0, ncol = 10))
files <- list.files(pattern = "_linkedin_.*csv$")
# seq_along() so an empty match runs zero iterations (1:length() would yield
# c(1, 0) and error on files[1]).
for (i in seq_along(files)) {
    if (!(files[i] %in% completed_files)) {
        file <- paste0(url_base, files[i])
        csv <- read.csv(file = file, header = TRUE)
        new_jobs_df2 <- rbind(new_jobs_df2, csv)
        completed_files <- append(completed_files, files[i])
    }
}
writeLines(completed_files, "completed_files.txt")
# Drop exact duplicate rows picked up across overlapping scrapes.
new_jobs_df <- new_jobs_df[!duplicated(new_jobs_df), ]
new_jobs_df2 <- new_jobs_df2[!duplicated(new_jobs_df2), ]
# Normalize the RSS-feed data frame to the _Jobs table schema.
if (nrow(new_jobs_df) > 0) {
    new_jobs_df <- subset(new_jobs_df, select = -c(X, author, summary,
                                                   content,
                                                   extracted_content_url,
                                                   published, created_at))
    cols <- c("Job_id", "Site_id", "Job_title", "Job_url")
    colnames(new_jobs_df) <- cols
    # Company and location are absent from the feeds; fill with NA for now.
    new_jobs_df$Job_company <- NA
    new_jobs_df$Job_location <- NA
    rownames(new_jobs_df) <- NULL
    new_jobs_df <- new_jobs_df[c("Job_id", "Job_title", "Job_url",
                                 "Job_company", "Job_location", "Site_id")]
    # 0 marks a listing whose description has not been scraped yet.
    new_jobs_df$Job_complete <- 0
}
# Normalize the LinkedIn data frame to the same _Jobs table schema.
if (nrow(new_jobs_df2) > 0) {
    new_jobs_df2 <- subset(new_jobs_df2, select = -c(X, job_url, company_url,
                                                     linkedin_company_url_cleaned,
                                                     posted_date,
                                                     normalized_company_name))
    cols <- c("Job_url", "Job_company", "Job_title", "Job_location")
    colnames(new_jobs_df2) <- cols
    # Job ids are assigned later; 1001 is the Site_id reserved for LinkedIn.
    new_jobs_df2$Job_id <- NA
    new_jobs_df2$Site_id <- 1001
    rownames(new_jobs_df2) <- NULL
    new_jobs_df2 <- new_jobs_df2[c("Job_id", "Job_title", "Job_url",
                                   "Job_company", "Job_location", "Site_id")]
    # 0 marks a listing whose description has not been scraped yet.
    new_jobs_df2$Job_complete <- 0
}
# Merge the new listings into the master jobs table, dropping any row whose
# Job_url duplicates an earlier one. data.table::rbindlist() evaluates
# !duplicated(Job_url) inside the table's own scope, and converts jobs_df to
# a data.table as a side effect of the assignment.
if (nrow(new_jobs_df) > 0){
jobs_df <- rbindlist(list(jobs_df, new_jobs_df))[!duplicated(Job_url)]
}
if (nrow(new_jobs_df2) > 0){
jobs_df <- rbindlist(list(jobs_df, new_jobs_df2))[!duplicated(Job_url)]
}
# Re-number Job_id sequentially after the merge. NOTE(review): later code
# names the per-job description files "<Job_id>.txt", so this renumbering
# assumes ids stay stable across runs once descriptions are saved — confirm.
jobs_df %<>%
mutate(Job_id = row_number())
# Scrape the job description for every listing that has not been processed
# (Job_complete == 0, column 7). On success the description is written to
# "<Job_id>.txt" and the row is flagged 1; unreachable pages and unscrapable
# sources are flagged -1 for manual handling. Column 6 is Site_id.
# NOTE(review): the original called httr::user_agent("Glen Davis") here and
# discarded the result — user_agent() only builds a request config object,
# so the call had no effect and has been removed.
for (i in seq_len(nrow(jobs_df))){
    if (jobs_df[i, 7] == 0){
        dat <- try(xml2::read_html(jobs_df$Job_url[[i]]), silent = TRUE)
        if (inherits(dat, "try-error")){
            jobs_df[i, 7] <- -1  # page unreachable
            next
        }
    }else{
        next  # already handled (1 = done, -1 = failed/manual)
    }
    if (jobs_df[i, 6] == 2594160){ #ai-jobs.net is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@id, 'job-description')]")
    } else if (jobs_df[i, 6] == 977141){ #python.org is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job-description')]")
    } else if (jobs_df[i, 6] == 2594162){ #careercast it & eng is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'arDetailDescriptionRow')]")
    } else if (jobs_df[i, 6] == 1378327){ #jobs for r-users is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'section_content')]")
    } else if (jobs_df[i, 6] == 2593879){ #Indeed is source; blocks scraping
        jobs_df[i, 7] <- -1
        next
    } else if (jobs_df[i, 6] == 2594166){ #Open Data Science is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job-desc')]")
    } else if (jobs_df[i, 6] == 2594174){ #MLconf is source
        desc <- xml2::xml_find_all(
            dat, "//div[contains(@class, 'job_description')]")
    } else if (jobs_df[i, 6] == 1001){ #Linkedin is source; blocks scraping
        jobs_df[i, 7] <- -1
        next
    } else {
        # Unknown Site_id: the original fell through here and reused a stale
        # `desc` from an earlier iteration; mark for manual handling instead.
        jobs_df[i, 7] <- -1
        next
    }
    desc <- xml2::xml_text(desc)
    fn <- paste0(jobs_df[i, 1], ".txt")
    writeLines(desc, con = fn)
    jobs_df[i, 7] <- 1
}
# Export the LinkedIn listings whose descriptions could not be scraped so
# they can be looked up by hand. Multiple filter() conditions are ANDed.
manual <- jobs_df %>%
    filter(Job_complete == -1, Site_id == 1001)
write.csv(manual, "manual.csv", row.names = FALSE)
We look up the job descriptions for the job listings in the manual.csv file manually, and we save them as column eight of a manual_edited.csv file. If we find a job description, we change the Job_complete value to 1. If we don’t, we can just delete that row. If a previous file exists, we save over it. We then upload the saved manual_edited.csv file to github before continuing.
# Pull the hand-edited descriptions back in. manual_edited.csv mirrors
# manual.csv with the description added as column 8 and an updated
# Job_complete in column 7; rows the editor deleted simply stay at -1.
file <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/manual_edited.csv"
manual_edited <- read.csv(file = file, header = TRUE)
# seq_len() so an empty manual_edited.csv runs zero iterations (1:nrow()
# would yield c(1, 0) and index a nonexistent row).
for (i in seq_len(nrow(manual_edited))) {
    job_id <- manual_edited[i, 1]
    # NOTE(review): indexing jobs_df rows by Job_id assumes Job_id equals the
    # row number, which holds after the row_number() renumbering — confirm.
    if (jobs_df[job_id, 7] != 1) {
        job_desc <- manual_edited[i, 8]
        jobs_df[job_id, 7] <- manual_edited[i, 7]
        fn <- paste0(job_id, ".txt")
        writeLines(job_desc, con = fn)
    }
}
# One-time import of the Kaggle "data science jobs on Indeed USA" dump, which
# already includes descriptions (Indeed itself cannot be scraped). Guarded by
# the completed-files ledger so it only ever runs once.
completed_files <- readLines("completed_files.txt")
if (!("data_science_jobs_indeed_usa.csv" %in% completed_files)){
    file <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/data_science_jobs_indeed_usa.csv"
    kaggle_indeed <- read.csv(file = file, header = TRUE)
    kaggle_indeed <- subset(kaggle_indeed, select = -c(1, 5, 6, 7, 8))
    cols <- c("Job_title", "Job_company", "Job_location", "Job_url",
              "Job_description")
    colnames(kaggle_indeed) <- cols
    # Continue Job_id numbering after the existing rows. seq_len() keeps this
    # safe even for an empty dump (the original seq(n + 1, n + 0) would have
    # produced a descending sequence).
    kaggle_indeed$Job_id <- nrow(jobs_df) + seq_len(nrow(kaggle_indeed))
    kaggle_indeed$Site_id <- 2593879  # Indeed's Site_id
    kaggle_indeed$Job_complete <- 0
    rownames(kaggle_indeed) <- NULL
    kaggle_indeed <- kaggle_indeed[c("Job_id", "Job_title", "Job_url",
                                     "Job_company", "Job_location",
                                     "Site_id", "Job_complete",
                                     "Job_description")]
    jobs_df <- rbind(jobs_df, subset(kaggle_indeed, select = 1:7))
    # Write each description to "<Job_id>.txt" and flag the row complete.
    for (i in seq_len(nrow(kaggle_indeed))){
        job_id <- kaggle_indeed[i, 1]
        job_desc <- kaggle_indeed[i, 8]
        jobs_df[job_id, 7] <- 1
        fn <- paste0(job_id, ".txt")
        writeLines(job_desc, con = fn)
    }
    write("data_science_jobs_indeed_usa.csv", file = "completed_files.txt",
          append = TRUE)
}
# Push the updated jobs table back to MariaDB, but only when it differs from
# what is already stored.
db_password <- as.character(read.table("sql_db.txt", header = FALSE))
con <- DBI::dbConnect(
    RMariaDB::MariaDB(),
    dbname = "dat_sci_jobs",
    username = "root",
    password = db_password,
    host = "35.227.102.234")
tables <- dbListTables(con)
copy <- dbReadTable(con, "_Jobs")
# NOTE(review): identical() is strict — class/attribute differences between
# the in-memory table and the freshly read copy will force a rewrite even
# when the data match; confirm that is acceptable.
if (!identical(jobs_df, copy)){
    dbWriteTable(con, "_Jobs", jobs_df, overwrite = TRUE)
}
dbDisconnect(con)
# Move freshly written "<Job_id>.txt" files into the jobs-txt/ subdirectory.
files <- list.files(pattern = "^[1-9]+.*txt$")
if (length(files) > 0){
    src <- file.path(getwd(), files)
    dst <- file.path(getwd(), "jobs-txt", files)
    moved <- file.copy(from = src, to = dst)
    # Only delete sources that actually copied — the original removed them
    # unconditionally, losing data on a failed copy. (Also note file.remove()
    # has no 'from' argument; paths are passed positionally via `...`.)
    file.remove(src[moved])
}
# Build (or resume building) text_df: one row per line of each saved job
# description, tagged with its Job_id and line number.
files <- list.files(path = paste0(getwd(), "/jobs-txt/"),
                    pattern = "^[1-9]+.*txt$")
cols <- c("Text", "Job_id", "Line")
completed_txt_files <- readLines("completed_txt_files.txt")
if (length(completed_txt_files) > 0){
    # Resume from the corpus previously saved to GitHub.
    my_url <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/text_df.csv"
    text_df <- read.csv(file = my_url, header = TRUE,
                        fileEncoding = "UTF-8")
}else{
    # First run: start from an empty three-column frame.
    text_df <- as.data.frame(matrix(nrow = 0, ncol = 3))
    colnames(text_df) <- cols
}
# Append every not-yet-processed description file to text_df, recording each
# file handled in the completed_txt_files ledger, then persist the corpus.
url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/jobs-txt/"
for (i in seq_along(files)){
    if (!files[i] %in% completed_txt_files){
        file <- paste(url_base, files[i], sep = "")
        # Anchor and escape the extension ('.' is a regex metacharacter) so
        # only a trailing ".txt" is stripped from the file name.
        job_id <- str_replace(files[i], "\\.txt$", "")
        lines <- readLines(file)
        # The original rebuilt these matrices inside a loop over every line;
        # they do not depend on the loop index, so build them once.
        col2 <- matrix(job_id, nrow = length(lines), ncol = 1)
        col3 <- matrix(seq_along(lines), nrow = length(lines), ncol = 1)
        addition <- cbind(lines, col2, col3)
        colnames(addition) <- cols
        text_df <- rbind(text_df, addition)
        write(files[i], file = "completed_txt_files.txt", append = TRUE)
    }
}
rownames(text_df) <- NULL
write.csv(text_df, "text_df.csv", row.names = FALSE)
# Lowercase the text, drop blank lines, and tokenize into single words
# (one row per word, keeping Job_id and Line).
text_df_clean <- text_df %>%
    mutate(Text = tolower(Text)) %>%
    filter(Text != "")
tidy_text_df_words <- text_df_clean %>%
    unnest_tokens(word, Text)
# Score each non-stopword: term frequency times the number of distinct job
# descriptions it appears in, scaled down by 1e6 for readability.
tidy_text_words_analysis <- tidy_text_df_words %>%
    # Explicit join key (the implicit join was emitting a "Joining with
    # `by = join_by(word)`" message); result is unchanged.
    anti_join(get_stopwords(), by = "word") %>%
    group_by(word) %>%
    summarize(term_freq = n(),
              doc_count = n_distinct(Job_id),
              tf_dc_score = round((term_freq * doc_count / 1000000), 3)) %>%
    arrange(desc(tf_dc_score))
## Joining with `by = join_by(word)`
# Browseable table of the scored words.
tidy_text_words_analysis %>%
    datatable(options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Word cloud of the 50 words found in the most distinct job descriptions.
with(tidy_text_words_analysis,
     wordcloud(word, doc_count, max.words = 50))
## Warning in wordcloud(word, doc_count, max.words = 50): business could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): development could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): required could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): ability could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): degree could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): team could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): computer could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): data could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): skills could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): opportunity could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): environment could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): technical could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): experience could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): support could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): strong could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): years could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): analytics could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): benefits could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): information could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): analysis could not be
## fit on page. It will not be plotted.
# Tokenize into bigrams and score as with single words. A bigram is kept
# only when neither of its words is a stopword.
tidy_text_df_bigrams <- text_df_clean %>%
    unnest_tokens(bigram, Text, token = "ngrams", n = 2)
tidy_text_bigrams_analysis <- tidy_text_df_bigrams %>%
    # Drop NA bigrams (lines with fewer than two tokens) before the joins
    # and grouping rather than after summarizing — same result, less work.
    filter(!is.na(bigram)) %>%
    separate(bigram, into = c("first", "second"),
             sep = " ", remove = FALSE) %>%
    anti_join(stop_words, by = c("first" = "word")) %>%
    anti_join(stop_words, by = c("second" = "word")) %>%
    group_by(bigram) %>%
    summarize(term_freq = n(),
              doc_count = n_distinct(Job_id),
              tf_dc_score = round((term_freq * doc_count / 1000000), 3)) %>%
    arrange(desc(tf_dc_score))
# Browseable table of the scored bigrams.
tidy_text_bigrams_analysis %>%
    datatable(options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Word cloud of the 50 bigrams found in the most distinct job descriptions.
with(tidy_text_bigrams_analysis,
     wordcloud(bigram, doc_count, max.words = 50))
## Warning in wordcloud(bigram, doc_count, max.words = 50): opportunity employer
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): data analysis could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): cross functional could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): computer science could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): machine learning could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): receive consideration
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): data scientists could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): data science could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): related field could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): job description could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): color religion could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): national origin could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): bachelor's degree
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): software development
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): gender identity could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): equal opportunity
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): marital status could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): orientation gender
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): data scientist could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): religion sex could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): cutting edge could not
## be fit on page. It will not be plotted.
# Tokenize into trigrams and score. Only the first and third words are
# checked against the stopword list — the middle word may be a stopword
# (e.g. "degree in statistics"), which appears deliberate; confirm.
tidy_text_df_trigrams <- text_df_clean %>%
    unnest_tokens(trigram, Text, token = "ngrams", n = 3)
tidy_text_trigrams_analysis <- tidy_text_df_trigrams %>%
    # Drop NA trigrams (lines with fewer than three tokens) before the joins
    # and grouping rather than after summarizing — same result, less work.
    filter(!is.na(trigram)) %>%
    separate(trigram, into = c("first", "second", "third"),
             sep = " ", remove = FALSE) %>%
    anti_join(stop_words, by = c("first" = "word")) %>%
    anti_join(stop_words, by = c("third" = "word")) %>%
    group_by(trigram) %>%
    summarize(term_freq = n(),
              doc_count = n_distinct(Job_id),
              tf_dc_score = round((term_freq * doc_count / 1000000), 3)) %>%
    arrange(desc(tf_dc_score))
# Browseable table of the scored trigrams.
tidy_text_trigrams_analysis %>%
    datatable(options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html